from dataprocess.PDFTextExtractor import PDFTextExtractor
from dataprocess.elastic.EsSinker import EsSinker
from fulltext.Document import Document
from Logger import Logger

logger = Logger.get_logger(__name__)

class PdfProcessor(EsSinker):
    """Extract the text of a PDF file and index it as a document."""

    def run(self, messages: dict) -> bool:
        """
        Extract the text of one PDF and index the resulting document.

        Args:
            messages: mapping that must contain a 'file_path' entry; that
                value is handed to PDFTextExtractor and to Document.create.

        Returns:
            Whatever ``self.index_document`` returns for the new document
            (presumably a bool -- confirm against its definition), or
            False when any exception is raised during processing.
        """
        try:
            pdf_path = messages['file_path']
            # The extractor is used as a context manager; indexing happens
            # while it is still open, exactly as the lookup/extract steps do.
            with PDFTextExtractor(pdf_path) as pdf:
                full_text = pdf.get_merged_pages_text()
                doc = Document.create(
                    file_path=pdf_path,
                    text_content=full_text,
                )
                return self.index_document(doc)
        except Exception as err:
            # Broad catch on purpose: every failure (including a missing
            # 'file_path' key) is logged and reported as False to the caller.
            logger.error(f"处理文档失败: {err!s}")
            return False
